Data was imported using the data_gathering.Rmd script. See that script for details of collection.
# Twitter summary table: per-brand follower, status, like, and retweet counts
# plus engagement-per-user (data prepared upstream in data_gathering.Rmd).
pander(twitter_summary_stats)
| Company | Twitter_Followers | Twitter_Statuses | Twitter_Likes |
|---|---|---|---|
| Labatt USA | 18535 | 2816 | 13417 |
| Molson Canadian | 17258 | 4541 | 8784 |
| Michelob ULTRA | 54561 | 2931 | 41684 |
| Bud Light | 159826 | 17758 | 13085 |
| Twitter_Retweets | Twitter_EngagementPerUser |
|---|---|
| 8645 | 1.19 |
| 4287 | 0.7574 |
| 15411 | 1.046 |
| 5235 | 0.1146 |
# Facebook summary table: per-brand comment/like/share totals and post counts.
pander(summary_stats)
| Company | Comments | Likes | Shares | Total.Posts |
|---|---|---|---|---|
| Labatt USA | 6717 | 127377 | 27884 | 1315 |
| Molson Canadian | 7170 | 60077 | 10678 | 517 |
| Michelob ULTRA | 116516 | 4614127 | 254690 | 3484 |
| Bud Light | 531451 | 20137767 | 1878365 | 6927 |
Taking in raw data and adding a parseable timestamp while filtering on the date and client_ids.
Define functions to create posts per day of week graphs, and timeseries of engagement line graphs.
Shape data into vertical data formats.
##
## Attaching package: 'chron'
## The following objects are masked from 'package:lubridate':
##
## days, hours, minutes, seconds, years
## [1] "tbl_df" "tbl" "data.frame"
The first plot shows aggregated engagement by content type. The second plot shows engagement by type for the client (Labatt).
Looking at the engagement by content type we see that Labatt is garnering its most significant engagement on Photos, Video, and Links.
[ ] TODO: we need to compare posting activity with engagement activity (scatter plot)
Horizontal stacked bar chart for total engagement comparison of all companies
# Re-level a vector as a factor whose levels are ordered by ascending
# frequency, so charts display the rarest category first.
reorder_size <- function(x) {
  levels_by_freq <- names(sort(table(x)))
  factor(x, levels = levels_by_freq)
}
# Horizontal stacked bars: log-scaled total Facebook engagement per brand.
# Total.Posts is dropped so only engagement counts (comments/likes/shares)
# are stacked.
p <- summary_stats %>%
  filter(Engagement != "Total.Posts") %>%
  ggplot(aes(x = Company, y = log(Number), fill = Engagement)) +
  geom_col() +
  labs(x = 'Brand', y = 'Engagement(Scaled)',
       title = 'Logarithmic Transformation of Total Engagement(Facebook)') +
  coord_flip()
plot(p)
Total posts per day of the week.
# Draw the posts-per-weekday chart for every tracked client.
# NOTE(review): without a brand label on each chart these are uninformative.
for (i in seq_along(df_names)) {
  plot(day_of_week(df_names[i], client_names[i]))
}
# Faceted histogram of posting activity: posts per weekday, one panel
# per brand, with bars shaded by their own count.
p <- ggplot(all_companies_ts, aes(x = wday(timestamp, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") +
  facet_wrap(~from_name, ncol = 4) +
  labs(x = "Day of the Week", y = "Number of Posts",
       title = "Daily Posting Activity by Brand(Facebook)") +
  theme(legend.position = "none")
plot(p)
# Mean engagement per post for each weekday, faceted by brand.
dowDat <- select(all_companies_ts, total_engagement, from_name, timestamp)
dowDat$dow <- wday(dowDat$timestamp, label = TRUE)
# Average (not total) engagement per dow/brand cell, despite the y-axis label.
dowDat <- aggregate(total_engagement ~ dow + from_name, data = dowDat,
                    FUN = mean)
p <- ggplot(dowDat, aes(x = dow, y = total_engagement)) +
  geom_col(aes(fill = total_engagement)) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") +
  facet_grid(~from_name) +
  labs(title = 'Engagements Per Day of Week(Facebook)',
       x = "Day of the Week", y = "Number of Engagements") +
  theme(legend.position = "none")
plot(p)
-[ ] TODO: Create a plot for posts-by-engagement graphics (scatter plot), to answer the question: on days with lots of posts, do we also get lots of engagement?
# Distribution of (log) per-post engagement by calendar month, one facet row
# per brand. log() yields non-finite values for zero engagement, which ggplot
# drops (hence the "non-finite values" warning that follows).
# Fix: title typo "Engagment" -> "Engagement".
mdat <- all_companies_ts
mdat$month <- format(as.POSIXct(mdat$timestamp), '%m')
mdat %>%
  ggplot(aes(month, log(total_engagement))) +
  geom_boxplot() +
  ggtitle('Engagement grouped by Month(Facebook)') +
  ylab('Engagement') + xlab('Month') +
  facet_grid(from_name ~ ., scales = "free")
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
Plots for the timeseries engagement line.
# One engagement time-series plot per client.
# Fix: the loop iterated seq_along(df_names) but indexed client_names_proper;
# iterate the vector that is actually indexed so a length mismatch between
# the two vectors cannot cause NA lookups.
for (i in seq_along(client_names_proper)) {
  p <- timeseries_engagement(client_names_proper[i])
  plot(p)
}
# Restrict to tracked clients and bucket each post into its calendar month.
# Fix: the original called cut(all_companies_ts$timestamp, ...) inside
# mutate(), i.e. the UNFILTERED column — once filter() drops any rows the
# vector length no longer matches the filtered frame (error or silent
# misalignment). Reference the masked column instead.
all_companies_ts <- all_companies_ts %>%
  filter(from_id %in% client_ids) %>%
  mutate(month = as.Date(cut(timestamp, breaks = "month")))
# Monthly total engagement per brand, with a smoothed trend line per brand.
# NOTE(review): ylim() drops points above 2.2M rather than zooming (see the
# "Removed ... rows" warning below); coord_cartesian() would zoom without
# discarding data — left as-is to preserve the original plot.
all_companies_ts %>%
  select(from_name, month, total_engagement) %>%
  group_by(from_name, month) %>%
  summarise(totEng = sum(total_engagement)) %>%
  ggplot(aes(x = month, y = totEng)) +
  geom_point(aes(color = from_name)) +
  geom_smooth(aes(color = from_name), se = FALSE) +
  ylim(0, 2200000) +
  labs(x = 'Years', y = 'Total Engagements',
       title = 'Engagement Over Time(Facebook)')
## Warning: Removed 18 rows containing missing values (geom_smooth).
# Labatt vs Molson only, 2015-2016: monthly engagement with trend lines.
# Fix: year() returns a number, but it was compared against the strings
# c('2015', '2016'); %in% coerced both sides so it happened to work — compare
# against numeric years so the types match.
all_companies_ts %>%
  select(from_name, month, total_engagement, timestamp) %>%
  filter(from_name != "Bud Light") %>%
  filter(from_name != "Michelob ULTRA") %>%
  filter(year(timestamp) %in% c(2015, 2016)) %>%
  group_by(from_name, month) %>%
  summarise(totEng = sum(total_engagement)) %>%
  ggplot(., aes(x = month, y = totEng)) +
  geom_point(aes(color = from_name)) +
  geom_smooth(aes(color = from_name), se = FALSE) +
  ggtitle("Monthly Facebook Engagement Labatt vs Molson")
What is different about the content during this period?
Might be valuable to look back at the entire timeseries for periods of distinct dynamism.
Removed the filter because Labatt does not show a significant inflection point, unlike the previous analysis.
labatt$timestamp <- date(labatt$timestamp)
# Strip @mentions, ampersands, RT/via retweet markers, punctuation, digits,
# URL remnants, and runs of whitespace from the Labatt messages. Patterns are
# applied in the same order as before so the downstream corpus is unchanged.
labatt_clean_pre <- str_replace_all(labatt$message, "@\\w+", "")
for (pattern in c("&",
                  "(RT|via)((?:\\b\\W*@\\w+)+)",
                  "@\\w+",
                  "[[:punct:]]",
                  "[[:digit:]]",
                  "http\\w+",
                  "[ \t]{2,}",
                  "^\\s+|\\s+$")) {
  labatt_clean_pre <- gsub(pattern, "", labatt_clean_pre)
}
# Build a tm corpus from the cleaned Labatt messages: drop punctuation,
# lower-case, remove English stopwords plus dataset-specific noise tokens,
# and squeeze whitespace — then render the top-25 word cloud.
labatt_corpus_pre <- Corpus(VectorSource(labatt_clean_pre)) %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords, c("amp", "2yo", "3yo", "4yo")) %>%
  tm_map(stripWhitespace)
pal <- brewer.pal(9, "YlGnBu")[-(1:4)]  # keep only the darker shades
set.seed(123)  # reproducible word placement
wordcloud(words = labatt_corpus_pre, scale = c(5, 0.1), max.words = 25,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = pal)
Displays engagement per post to find outliers.
# Scatter of every post's engagement over time, colored by brand, to surface
# outlier posts.
p <- ggplot(all_companies_ts, aes(x = month, y = total_engagement)) +
  geom_point(aes(color = from_name)) +
  labs(x = "Year", y = "Total Engagement") +
  theme(legend.title = element_blank(),
        legend.text = element_text(size = 12),
        legend.position = c(0.18, 0.77),
        legend.background = element_rect(fill = alpha('gray', 0)))
plot(p)
# q <- aggregate(all_companies_ts$total_engagement~all_companies_ts$month+
# all_companies_ts$from_name,
# FUN=sum)
#
# ggplot(q, aes(x = q$`all_companies_ts$month`, y = q$`all_companies_ts$total_engagement`)) +
# geom_line(aes(color=q$`all_companies_ts$from_name`)) +
# ylab("Total Engagement") + xlab("Year") +
# theme(legend.title=element_blank(),
# legend.text=element_text(size=12),
# legend.position=c(0.18, 0.77),
# legend.background=element_rect(fill=alpha('gray', 0)))
### Content mix over time, per brand ###
# Fix: the four brand chunks below were identical copy-paste blocks (several
# with mislabeled "#Labatt Content Over Time" comments, originally Tristen's
# graphs) — consolidated into one helper called once per brand. Also avoids
# reusing `t`, which shadows base::t().

# Plot monthly post counts by content type for one brand's Facebook page.
plot_content_mix <- function(brand, plot_title) {
  brand_posts <- all_companies_ts %>%
    filter(from_name == brand)
  counts <- data.frame(table(brand_posts$month, brand_posts$type))
  counts$Var1 <- date(counts$Var1)  # table() stringified the month dates
  p <- ggplot(counts, aes(x = Var1, y = Freq, group = Var2)) +
    geom_line(aes(color = Var2)) +
    ggtitle(plot_title) +
    xlab("Year") + ylab("Post Frequency") +
    theme(legend.title = element_blank(),
          legend.text = element_text(size = 12),
          legend.position = c(0.18, 0.77),
          legend.background = element_rect(fill = alpha('gray', 0)))
  print(p)  # explicit print: ggplot objects don't auto-print inside a function
}

plot_content_mix("Molson Canadian", 'Molson Engagement(Facebook)')
plot_content_mix("Labatt USA", 'Labatt Facebook Activity(Facebook)')
plot_content_mix("Michelob ULTRA", 'Michelob ULTRA Engagement(Facebook)')
plot_content_mix("Bud Light", 'Bud Light Engagement(Facebook)')
# LabattUSA_timeline %>%
# filter()
#
#
# tweets <- LabattUSA_timeline$text
# match <- regmatches(tweets,gregexpr("#[[:alnum:]]+",tweets))
#
# # Convert the list to a corpus
# # new_corpus <- as.VCorpus(new_list) from Stackoverflow (http://stackoverflow.com/questions/34061912/how-transform-a-list-into-a-corpus-in-r)
#
# new_corpus <- as.VCorpus(match)
# class(new_corpus)
# inspect(new_corpus)
#
# EnsurePackage <- function(x) {
# # EnsurePackage(x) - Installs and loads a package if necessary
# # Args:
# # x: name of package
#
# x <- as.character(x)
# if (!require(x, character.only=TRUE)) {
# install.packages(pkgs=x, repos="http://cran.r-project.org")
# require(x, character.only=TRUE)
# }
# }
#
# MakeWordCloud <- function(corpus) {
# # Make a word cloud
# #
# # Args:
# # textVec: a text vector
# #
# # Returns:
# # A word cloud created from the text vector
#
# EnsurePackage("tm")
# EnsurePackage("wordcloud")
# EnsurePackage("RColorBrewer")
#
# corpus <- tm_map(corpus, function(x) {
# removeWords(x, c("via", "rt", "mt"))
# })
#
# ap.tdm <- TermDocumentMatrix(corpus)
# ap.m <- as.matrix(ap.tdm)
# ap.v <- sort(rowSums(ap.m), decreasing=TRUE)
# ap.d <- data.frame(word = names(ap.v), freq=ap.v)
# table(ap.d$freq)
# pal2 <- brewer.pal(8, "Dark2")
#
# wordcloud(ap.d$word, ap.d$freq,
# scale=c(8, .2), min.freq = 3,
# max.words = Inf, random.order = FALSE,
# rot.per = .15, colors = pal2)
# }
#
# MakeWordCloud(new_corpus)
# p <- unfiltered_ts %>%
# summarise(jd = doy(timestamp)) %>%
# group_by(jd) %>%
# ggplot(aes(factor(jd),total_engagement)) +
# geom_boxplot() +
# facet_grid(~ from_name)
# plot(p)
[ ] Create a data.frame with these columns: brand, date, tweet, engagement (I think this is a subset of all_companies)
[ ] summary table of brand, month, totEng, see examples:http://leonawicz.github.io/HtmlWidgetExamples/ex_dt_sparkline.html
# Does more posting buy more engagement? Monthly post count vs total
# engagement, log-log, with a per-brand linear fit.
# Fix: title typo "Acitivity" -> "Activity"; dropped the dead commented-out
# geom_smooth line that sat mid-chain.
all_companies_ts %>%
  select(from_name, timestamp, total_engagement) %>%
  group_by(from_name, month(timestamp), year(timestamp)) %>%
  summarise(count = n(),
            engagement = sum(total_engagement)) %>%
  ggplot(., aes(y = log(engagement), x = log(count), colour = from_name)) +
  geom_point() +
  xlab('Post Activity') + ylab('Engagement') +
  geom_smooth(se = FALSE, method = "lm") +
  ggtitle("Engagement vs Post Activity(Facebook)")
# Same activity-vs-engagement view with axis labels naming the raw scales;
# the commented filters were used to isolate the two smaller brands.
# Fix: title typo "Acitivity" -> "Activity".
all_companies_ts %>%
  # filter(from_name != "Bud Light") %>%
  # filter(from_name != "Michelob ULTRA") %>%
  select(from_name, timestamp, total_engagement) %>%
  group_by(from_name, month(timestamp), year(timestamp)) %>%
  summarise(count = n(),
            engagement = sum(total_engagement)) %>%
  ggplot(., aes(y = log(engagement), x = log(count), colour = from_name)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  ggtitle("Engagement vs Post Activity(Facebook)") +
  ylab("Total Engagement") + xlab("Total Monthly Posts")
# Labatt only: post efficacy by content type — log monthly post count vs log
# engagement, with one regression line per content type.
all_companies_ts %>%
  filter(from_name == "Labatt USA") %>%
  select(from_name, timestamp, type, total_engagement) %>%
  group_by(from_name, month(timestamp), year(timestamp), type) %>%
  summarise(count = n(),
            engagement = sum(total_engagement)) %>%
  ggplot(aes(y = log(engagement), x = log(count), colour = type)) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  labs(title = "Post Efficacy by type for Labatt USA(Facebook)",
       y = "Total Engagement", x = "Total Monthly Posts")
# Labatt engagement distribution by time-of-day bucket. ylim(0, 2000) clips
# high outliers, which triggers the "Removed ... rows" warning below.
# Fix: the title said "Post Efficacy by type" — copy-pasted from the previous
# chunk — but this plot groups by time of day.
all_companies_ts %>%
  filter(from_name == "Labatt USA") %>%
  select(from_name, tod, total_engagement) %>%
  ggplot(., aes(y = total_engagement, x = factor(tod), colour = from_name)) +
  geom_boxplot() +
  ylim(c(0, 2000)) +
  ggtitle("Post Efficacy by Time of Day for Labatt USA(Facebook)") +
  ylab("Total Engagement") + xlab("Time of Day")
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
# load('processed_data/bud_fb.RData')
# bud$total_engagement <- rowSums(bud[,9:11])
# z <- bud %>%
# arrange(desc(total_engagement))
# head(z)
# Updated upstream
# Normalize raw social-media text for sentiment scoring / corpus building:
# strips @mentions, ampersands, RT/via retweet markers, punctuation, digits,
# URL remnants, and surplus whitespace — in that order.
#
# Args:
#   cleanliness: character vector of raw message text.
# Returns: a character vector of the same length, cleaned.
#
# NOTE: runs of 2+ spaces/tabs are deleted outright (not collapsed to one
# space), so words around them get concatenated — preserved from the original
# implementation since downstream sentiment scores depend on it.
# Fixes: the original removed "@\\w+" twice (once via stringr, once via base
# gsub) — deduplicated, and unified on base gsub throughout.
text_clean <- function(cleanliness) {
  patterns <- c(
    "@\\w+",                        # @mentions
    "&",                            # bare ampersands
    "(RT|via)((?:\\b\\W*@\\w+)+)",  # retweet attribution (defensive; mentions already gone)
    "[[:punct:]]",                  # punctuation
    "[[:digit:]]",                  # digits
    "http\\w+",                     # URL remnants left after punctuation removal
    "[ \t]{2,}",                    # runs of spaces/tabs (removed, not collapsed)
    "^\\s+|\\s+$"                   # leading/trailing whitespace
  )
  for (pattern in patterns) {
    cleanliness <- gsub(pattern, "", cleanliness)
  }
  cleanliness
}
# Fix: the four brand-sentiment chunks below were identical copy-paste
# blocks — consolidated into three helpers plus one call group per brand.
# All global objects the originals created (*_timeline$sentiment column and
# the four *_sentiment data frames) are still produced, and each plot title
# is preserved byte-for-byte (including Michelob's trailing newline).

# Attach an NRC sentiment breakdown (list column) to a tweet timeline.
add_nrc_sentiment <- function(timeline) {
  timeline$sentiment <- lapply(text_clean(timeline$text), get_nrc_sentiment)
  timeline
}

# Flatten a timeline into a (created, text, sentiment, score) data frame;
# score is syuzhet's get_sentiment() of the cleaned text.
build_sentiment_df <- function(timeline) {
  sentiment_df <- data.frame('created' = timeline$created,
                             'text' = timeline$text,
                             'sentiment' = as.character(timeline$sentiment))
  sentiment_df$score <- get_sentiment(as.character(text_clean(sentiment_df$text))) %>%
    as.numeric()
  sentiment_df
}

# Scatter + smoothed trend of sentiment score over time for one brand.
plot_sentiment <- function(sentiment_df, plot_title) {
  p <- sentiment_df %>%
    ggplot(aes(as_date(created), score)) +
    geom_point() +
    geom_smooth() +
    scale_color_manual(values = colourList) +
    scale_x_date(name = '\nDates', breaks = date_breaks("3 months"),
                 labels = date_format("%Y-%b")) +
    scale_y_continuous(name = "Sentiment Score\n", breaks = seq(-5, 5, by = 1)) +
    theme_bw() +
    ggtitle(plot_title)
  print(p)  # ggplot objects don't auto-print inside a function
}

LabattUSA_timeline <- add_nrc_sentiment(LabattUSA_timeline)
labatt_sentiment <- build_sentiment_df(LabattUSA_timeline)
plot_sentiment(labatt_sentiment, 'Labatt USA Sentiment(Twitter)')

Molson_Canadian_timeline <- add_nrc_sentiment(Molson_Canadian_timeline)
molson_sentiment <- build_sentiment_df(Molson_Canadian_timeline)
plot_sentiment(molson_sentiment, 'Molson Canadian Sentiment(Twitter)')

budlight_timeline <- add_nrc_sentiment(budlight_timeline)
budlight_sentiment <- build_sentiment_df(budlight_timeline)
plot_sentiment(budlight_sentiment, 'Bud Light Sentiment(Twitter)')

MichelobULTRA_timeline <- add_nrc_sentiment(MichelobULTRA_timeline)
michelob_sentiment <- build_sentiment_df(MichelobULTRA_timeline)
plot_sentiment(michelob_sentiment, 'Michelob ULTRA Sentiment(Twitter)\n')